Statistical data analysis pt. 1

# Reading the pre-prepared dataset

all <- read_excel("che_covid19.xlsx")[,2:40] %>% relocate(mortality, .before = `Country Code`)
all
# There are some extra variables, but we will not use all of them

to_use <- c("mortality", "che", "beds", "pop65", "popdens", "urban", "dphe", "dghe", "tobacco", "procur", "doctors", "nurses", "beh_stayhome", "beh_socgathering", "beh_distance", "beh_tellsymp", "beh_handwash", "fob_social", "fob_handshake", "fob_stores", "fob_curfew", "perceivedreaction_d", "govtrust_d", "govfact_d", "perceivedeffectiveness_d", "region", "population")

# Selecting columns we plan to use

data <- subset(all, select=to_use)
data
# Alleviating the problems in the next function caused by "_" in column names
colnames(data) <- gsub("_", ".", colnames(data))

# Calculating descriptive statistics
number <- function(x, na.rm = TRUE){return(sum(!is.na(x)))}

stats <- data %>%
  summarise(across(where(is.numeric), 
                   list(mean = mean, median = median, sd = sd, min = min, max = max, Q1=~quantile(., probs = 0.25), Q3=~quantile(., probs = 0.75)), 
                   na.rm = TRUE)) %>% 
  pivot_longer(everything(), names_to = "name", values_to = "value") %>% 
  separate(name, c("variable", "statistic"), sep = "_") %>%
  pivot_wider(names_from = statistic, values_from = value) %>%
  arrange(variable) %>% 
  select(variable, mean, sd, min, Q1, median, Q3, max)

# Changing column names back
colnames(data) <- gsub("\\.", "_", colnames(data))
data

# Changing variable names to match
stats$variable <- gsub("\\.", "_", stats$variable)

options(scipen=999) # Disabling scientific notation (e.g., e+01)

# Putting variables in the original order
stats = na.omit(stats[match(to_use, stats$variable),])
stats
round_df <- function(x, digits) {
    # round all numeric variables
    # x: data frame 
    # digits: number of digits to round
    numeric_columns <- sapply(x, mode) == 'numeric'
    x[numeric_columns] <-  round(x[numeric_columns], digits)
    x
}

# Rounding the numbers
round_df(stats, 3)
variable mean sd min Q1 median Q3 max
mortality 134.616 106.706 0.390 50.638 125.660 204.732 605.680
che 7.623 2.580 2.495 5.689 7.540 9.260 16.885
beds 3.871 2.506 0.630 2.203 3.120 5.128 13.050
pop65 14.778 5.963 1.523 9.130 15.593 19.762 28.002
popdens 296.441 1043.790 3.576 34.626 99.851 218.329 8044.526
urban 75.385 16.109 18.585 66.545 79.732 87.029 100.000
dphe 35.833 14.427 13.667 25.914 34.440 48.149 70.604
dghe 63.963 14.562 28.732 51.746 64.810 74.086 85.323
tobacco 24.247 8.766 7.900 18.325 23.500 28.875 44.700
procur 0.517 0.504 0.000 0.000 1.000 1.000 1.000
doctors 33.812 16.049 4.650 23.620 32.370 43.598 80.130
nurses 69.248 50.026 2.800 26.465 61.645 102.375 216.700
beh_stayhome 83.462 7.333 58.678 81.242 84.828 87.905 94.411
beh_socgathering 92.326 4.991 75.297 90.952 94.353 95.561 99.000
beh_distance 78.763 9.116 47.866 74.073 81.322 85.271 90.561
beh_tellsymp 92.846 4.281 78.447 92.823 94.256 95.094 97.724
beh_handwash 91.686 2.830 83.720 90.474 92.055 93.745 96.566
fob_social 0.976 0.038 0.786 0.976 0.989 0.994 1.000
fob_handshake 0.967 0.037 0.745 0.959 0.977 0.986 1.000
fob_stores 0.806 0.167 0.197 0.768 0.870 0.910 0.971
fob_curfew 0.714 0.194 0.159 0.592 0.741 0.875 0.990
perceivedreaction_d 0.403 0.233 0.000 0.232 0.357 0.565 0.908
govtrust_d 0.575 0.245 0.090 0.376 0.584 0.799 0.957
govfact_d 0.633 0.239 0.092 0.491 0.709 0.823 0.979
perceivedeffectiveness_d 0.888 0.054 0.696 0.854 0.900 0.926 0.966
population 64586735.900 187687848.304 360563.000 5427584.250 10730269.500 47935001.500 1397715000.000
# Computing the correlation matrix for the numeric variables (all except region)
CorMatrix = cor(data[, !names(data) %in% c("region")] , use = "complete.obs")

cor.mtest <- function(mat, ...) {
    mat <- as.matrix(mat)
    n <- ncol(mat)
    p.mat<- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(mat[, i], mat[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
        }
    }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}

# Matrix of p-values (H0: correlation = 0)
p.mat <- cor.mtest(data[, !names(data) %in% c("region")])

CorMatrix<-round(CorMatrix,2)

col <- colorRampPalette(c("deeppink", "hotpink", "lightpink", "floralwhite", "darkseagreen1", "darkslategray2", "dodgerblue4"))

#png(file="corr.png", res=300, width=4500, height=4500)
corrplot(CorMatrix, method="color", col=col(200),  
         type="upper", order="original", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, # Text label color and rotation
         # Combine with significance
         p.mat = p.mat, sig.level = c(0.01, 0.05, 0.1), insig ="label_sig",
         # Hide correlation coefficient on the principal diagonal
         diag=FALSE , number.cex=0.9, tl.cex=1, pch.cex=1)

Correlogram

# Exploring region

data %>% ggplot(aes(y = mortality, x = region)) + 
  geom_boxplot()

# Drawing a map
theme_set(theme_bw())

thismap = map_data("world")

all$`Country Name` <- recode(all$`Country Name`, "'Egypt, Arab Rep.' = 'Egypt'; 'United Kingdom' = 'UK'; 'Korea, Rep.' = 'South Korea'; 'Russian Federation' = 'Russia'; 'Slovak Republic' = 'Slovakia'; 'United States' = 'USA'")

# Setting colors
thismap <- mutate(thismap, fill = ifelse(region %in% all$`Country Name`[all$region == 'Americas'], "#FF7F11", ifelse(region %in% all$`Country Name`[all$region == 'Europe'], "#1446A0", ifelse(region %in% all$`Country Name`[all$region == 'Western Pacific'], "#DB3069", ifelse(region %in% all$`Country Name`[all$region == 'South-East Asia'], "#00AF54", ifelse(region %in% all$`Country Name`[all$region == 'Eastern Mediterranean'], "#F5D547", "white"))))))

# Using scale_fill_identity to set correct colors
#png(file="map.png", res=300, width=4500, height=3000)
ggplot(thismap, aes(long, lat, fill = fill, group=group)) + 
  geom_polygon(colour="gray") + 
  scale_fill_identity("WHO Region", guide = "legend", labels = c("South-East Asia", "Europe", "Western Pacific", "Eastern Mediterranean", "Americas", "unavailable")) +
theme(legend.position = "bottom", legend.key.size = unit(1,"cm"),   legend.title=element_text(size=30), 
    legend.text=element_text(size=25))

World Map

formattable(data %>% group_by(region) %>%
     summarise(no_rows = length(region)))
region no_rows
Americas 12
Eastern Mediterranean 7
Europe 33
South-East Asia 2
Western Pacific 6
# There is too few instances of the three regions, so it makes sense to make an "other" category.

data = data %>% mutate(region = case_when(data$region=="Americas" ~ "Americas", data$region=="Europe" ~ "Europe", TRUE ~ "Other"))
# Exploring procur
data %>% ggplot(aes(y = mortality, x =  as.factor(procur))) + 
  geom_boxplot() + scale_x_discrete(name = "procur")

# Drawing scatterplots of everything with mortality
#png(file="scatter.png", res=300, width=6000, height=8000)
data %>% dplyr::select(c(1:25, 27)) %>% 
  gather(-mortality, key = "var", value = "value") %>%
  ggplot(aes(x = value, y = mortality)) +
    geom_point() +
    facet_wrap(~ var, ncol=3, scales = "free", shrink=TRUE) +
    theme_bw() + 
    theme(axis.text = element_text(size = 14),
          axis.title = element_text( size = 16, face = "bold" ),
          legend.position="none",
          strip.text = element_text(size = 20))

Scatterplots

Statistical data analysis pt. 2

# Doing the same things again with the newly rebuilt dataset
allnew <- read_excel("che_covid19-new.xlsx")[,2:37] %>% relocate(mortality, .before = `Country Code`)
# Now when we select countries with no less than 20 respondents in the Global Behaviors and Perceptions survey, there are 96 complete observations
allnew <- filter(allnew, n >= 20)
allnew
# There are some extra variables, but we will not use all of them

to_use1 <- c("mortality", "che", "pop65", "popdens", "urban", "dphe", "dghe", "doctors", "nurses", "beh_stayhome", "beh_socgathering", "beh_distance", "beh_tellsymp", "beh_handwash", "fob_social", "fob_handshake", "fob_stores", "fob_curfew", "perceivedreaction_d", "govtrust_d", "govfact_d", "perceivedeffectiveness_d", "region", "population")

# Selecting columns we plan to use

datanew <- subset(allnew, select=to_use1)
datanew
# Alleviating the problems in the next function caused by "_" in column names
colnames(datanew) <- gsub("_", ".", colnames(datanew))

# Calculating descriptive statistics
number <- function(x, na.rm = TRUE){return(sum(!is.na(x)))}

statsnew <- datanew %>%
  summarise(across(where(is.numeric), 
                   list(mean = mean, median = median, sd = sd, min = min, max = max, Q1=~quantile(., probs = 0.25), Q3=~quantile(., probs = 0.75)), 
                   na.rm = TRUE)) %>% 
  pivot_longer(everything(), names_to = "name", values_to = "value") %>% 
  separate(name, c("variable", "statistic"), sep = "_") %>%
  pivot_wider(names_from = statistic, values_from = value) %>%
  arrange(variable) %>% 
  select(variable, mean, sd, min, Q1, median, Q3, max)

# Changing column names back
colnames(datanew) <- gsub("\\.", "_", colnames(datanew))
datanew

# Changing variable names to match
statsnew$variable <- gsub("\\.", "_", statsnew$variable)

options(scipen=999) # Disabling scientific notation (e.g., e+01)

# Putting variables in the original order
statsnew = na.omit(statsnew[match(to_use1, statsnew$variable),])
statsnew
round_df <- function(x, digits) {
    # round all numeric variables
    # x: data frame 
    # digits: number of digits to round
    numeric_columns <- sapply(x, mode) == 'numeric'
    x[numeric_columns] <-  round(x[numeric_columns], digits)
    x
}

# Rounding the numbers
round_df(statsnew, 3)
variable mean sd min Q1 median Q3 max
mortality 118.112 101.068 0.390 30.935 105.145 184.107 605.680
che 7.005 2.522 2.343 5.280 6.878 8.663 16.885
pop65 12.195 6.668 1.157 6.429 12.111 18.753 28.002
popdens 268.424 856.981 3.298 46.360 100.047 220.683 8044.526
urban 68.466 20.491 17.313 56.996 71.190 83.755 100.000
dphe 39.795 16.242 11.957 26.599 39.478 49.723 77.270
dghe 57.860 18.333 14.867 45.247 59.586 73.157 88.043
doctors 27.952 17.935 0.600 12.657 26.290 40.515 80.130
nurses 58.149 46.944 2.800 19.030 51.520 74.270 216.700
beh_stayhome 82.679 8.712 48.359 79.057 84.243 88.498 95.720
beh_socgathering 91.259 5.792 70.304 88.162 93.192 95.545 99.300
beh_distance 76.325 9.914 47.866 69.791 77.600 84.310 92.067
beh_tellsymp 92.308 4.673 78.447 90.600 93.771 95.013 99.289
beh_handwash 91.494 3.286 80.600 90.182 91.975 93.745 97.921
fob_social 0.978 0.033 0.786 0.977 0.989 0.995 1.000
fob_handshake 0.965 0.045 0.661 0.956 0.975 0.985 1.000
fob_stores 0.814 0.144 0.197 0.775 0.858 0.901 0.971
fob_curfew 0.747 0.179 0.159 0.645 0.776 0.892 1.000
perceivedreaction_d 0.403 0.245 0.000 0.215 0.368 0.565 0.952
govtrust_d 0.540 0.251 0.040 0.370 0.524 0.771 0.957
govfact_d 0.600 0.244 0.092 0.400 0.653 0.803 0.979
perceivedeffectiveness_d 0.873 0.070 0.619 0.837 0.890 0.922 1.000
population 69540265.083 201817550.622 360563.000 5658078.250 12160829.500 53931840.500 1397715000.000
# Computing the correlation matrix for the numeric variables (all except region)
CorMatrix1 = cor(datanew[, !names(datanew) %in% c("region")] , use = "complete.obs")

cor.mtest <- function(mat, ...) {
    mat <- as.matrix(mat)
    n <- ncol(mat)
    p.mat<- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(mat[, i], mat[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
        }
    }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}

# Matrix of p-values (H0: correlation = 0)
p.mat1 <- cor.mtest(datanew[, !names(datanew) %in% c("region")])

CorMatrix1<-round(CorMatrix1,2)

col <- colorRampPalette(c("deeppink", "hotpink", "lightpink", "floralwhite", "darkseagreen1", "darkslategray2", "dodgerblue4"))

#png(file="newcorr.png", res=300, width=4500, height=4500)
corrplot(CorMatrix1, method="color", col=col(200),  
         type="upper", order="original", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, # Text label color and rotation
         # Combine with significance
         p.mat = p.mat1, sig.level = c(0.01, 0.05, 0.1), insig ="label_sig",
         # Hide correlation coefficient on the principal diagonal
         diag=FALSE , number.cex=0.9, tl.cex=1, pch.cex=1)
# Заново каждый раз, когда library(corrplot)
# trace(corrplot, edit=TRUE)
# заменить на 443 строке
# place_points = function(sig.locs, point) {
  #text(pos.pNew[, 1][sig.locs], pos.pNew[, 2][sig.locs], 
      # labels = point, col = pch.col, cex = pch.cex, 
       #lwd = 2)
# НА
#place_points = function(sig.locs, point) {
      #text(pos.pNew[, 1][sig.locs], (pos.pNew[, 2][sig.locs])+0.25, 
           #labels = point, col = pch.col, cex = pch.cex, 
           #lwd = 2)

New Correlogram

formattable(datanew %>% group_by(region) %>%
     summarise(no_rows = length(region)))
region no_rows
Africa 7
Americas 18
Eastern Mediterranean 13
Europe 43
South-East Asia 6
Western Pacific 9
# Exploring region

datanew %>% ggplot(aes(y = mortality, x = region)) + 
  geom_boxplot() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Drawing scatterplots of everything with mortality
#png(file="newscatter.png", res=300, width=6000, height=8000)
datanew %>% dplyr::select(c(1:22, 24)) %>% 
  gather(-mortality, key = "var", value = "value") %>%
  ggplot(aes(x = value, y = mortality)) +
    geom_point() +
    facet_wrap(~ var, ncol=3, scales = "free", shrink=TRUE) +
    theme_bw() + 
    theme(axis.text = element_text(size = 14),
          axis.title = element_text( size = 16, face = "bold" ),
          legend.position="none",
          strip.text = element_text(size = 20))

New Scatterplots

# Drawing a map
theme_set(theme_bw())

thismap1 = map_data("world")

allnew

allnew$`Country Name` <- recode(allnew$`Country Name`, "'Egypt, Arab Rep.' = 'Egypt'; 'United Kingdom' = 'UK'; 'Korea, Rep.' = 'South Korea'; 'Russian Federation' = 'Russia'; 'Slovak Republic' = 'Slovakia'; 'United States' = 'USA'; 'Iran, Islamic Rep.' = 'Iran'")

# Setting colors
thismap1 <- mutate(thismap1, fill = ifelse(region %in% allnew$`Country Name`[allnew$region == 'Americas'], "#FF7F11", ifelse(region %in% allnew$`Country Name`[allnew$region == 'Europe'], "#1446A0", ifelse(region %in% allnew$`Country Name`[allnew$region == 'Western Pacific'], "#DB3069", ifelse(region %in% allnew$`Country Name`[allnew$region == 'South-East Asia'], "#00AF54", ifelse(region %in% allnew$`Country Name`[allnew$region == 'Eastern Mediterranean'], "#F5D547", "white"))))))

# Using scale_fill_identity to set correct colors
#png(file="newmap.png", res=300, width=4500, height=3000)
ggplot(thismap1, aes(long, lat, fill = fill, group=group)) + 
  geom_polygon(colour="gray") + 
  scale_fill_identity("WHO Region", guide = "legend", labels = c("South-East Asia", "Europe", "Western Pacific", "Eastern Mediterranean", "Americas", "unavailable")) +
theme(legend.position = "bottom", legend.key.size = unit(1,"cm"),   legend.title=element_text(size=30), 
    legend.text=element_text(size=25))

New World Map

# Drawing a scatterplot over a limited range of density for better visibility
datanew %>% 
  ggplot(aes(popdens, mortality)) +
  geom_jitter(width = 0.25, alpha = 0.5) + xlim(0, 1000)

# Drawing a scatterplot over a limited range of population for better visibility
datanew %>% 
  ggplot(aes(population, mortality)) +
  geom_jitter(width = 0.25, alpha = 0.5) +  
  scale_x_continuous(labels = unit_format(unit = "M", scale = 1e-6), limits = c(0, 200000000), name = "population (mln.)")